#include "opencl_source_map.hpp" 
namespace MNN { 
#ifndef MNN_OPENCL_BUFFER_CLOSED
const char* buffer_convert_buf = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define GLOBAL_SIZE_2_DIMS __private const int global_size_dim0,__private const int global_size_dim1,\n"
"#define DEAL_NON_UNIFORM_DIM2(input1, input2) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n"
"#define GLOBAL_SIZE_3_DIMS __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n"
"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n"
"#define MNN_DATA_FORMAT_NCHW 0\n"
"#define MNN_DATA_FORMAT_NHWC 1\n"
"#define MNN_DATA_FORMAT_NC4HW4 2\n"
"#define MNN_DATA_FORMAT_C4NHW4 3\n"
"__kernel void buffer_convert_to_buffer(GLOBAL_SIZE_3_DIMS\n"
" __global const INPUT_TYPE *input_ptr,\n"
" __private const int4 shape,// N C H W\n"
" __global OUTPUT_TYPE *output_ptr\n"
") {\n"
" int wh=get_global_id(0);\n"
" int c=get_global_id(1);\n"
" int n=get_global_id(2);\n"
" DEAL_NON_UNIFORM_DIM3(wh,c,n);\n"
" int w=wh % shape.w;\n"
" int h=wh/shape.w;\n"
" int input_offset,output_offset;\n"
" \n"
"#if INPUT_FORMAT == MNN_DATA_FORMAT_NCHW\n"
" input_offset=((n*shape.y+c)*shape.z+h)*shape.w+w;\n"
"#elif INPUT_FORMAT == MNN_DATA_FORMAT_NHWC\n"
" input_offset=((n*shape.z+h)*shape.w+w)*shape.y+c;\n"
"#elif INPUT_FORMAT == MNN_DATA_FORMAT_NC4HW4\n"
" input_offset=((((c/4)*shape.x+n)*shape.z+h)*shape.w+w)*4+(c % 4);\n"
"#endif\n"
"#if OUTPUT_FORMAT == MNN_DATA_FORMAT_NCHW\n"
" output_offset=((n*shape.y+c)*shape.z+h)*shape.w+w;\n"
"#elif OUTPUT_FORMAT == MNN_DATA_FORMAT_NHWC\n"
" output_offset=((n*shape.z+h)*shape.w+w)*shape.y+c;\n"
"#elif OUTPUT_FORMAT == MNN_DATA_FORMAT_NC4HW4\n"
" output_offset=((((c/4)*shape.x+n)*shape.z+h)*shape.w+w)*4+(c % 4);\n"
"#endif\n"
" output_ptr[output_offset]=input_ptr[input_offset];\n"
"}\n"
"__kernel void buffer_copy_to_buffer(GLOBAL_SIZE_2_DIMS\n"
" __global const INPUT_TYPE *input_ptr,\n"
" __global OUTPUT_TYPE *output_ptr,\n"
" __private const int size // N C H W\n"
") {\n"
" const int x=get_global_id(0);\n"
" const int y=get_global_id(1);\n"
" DEAL_NON_UNIFORM_DIM2(x,y);\n"
" const int offset=x << 2;\n"
"#ifdef PACK_LEAVE\n"
" if(offset+3 >= size){\n"
" for(int i=0; i<size-offset; ++i){\n"
" output_ptr[offset+i]=(OUTPUT_TYPE)input_ptr[offset+i];\n"
" }\n"
" } else {\n"
"#endif\n"
" vstore4(CONVERT_OUTPUT4(vload4(0,input_ptr+offset)),0,output_ptr+offset);\n"
"#ifdef PACK_LEAVE\n"
" }\n"
"#endif\n"
"}\n"
"// convert kernel : from buffer(oihw) to image(ic,oc/4,h,w,oc4)\n"
"__kernel void conv2d_filter_buffer_to_nc4hw4_buffer(GLOBAL_SIZE_2_DIMS\n"
" __global const FLOAT *input_ptr,\n"
" __private const int output_channel,\n"
" __private const int2 kernel_shape,\n"
" __private const int ic_h_w_size,\n"
" __private const int height_width_size,\n"
" __global FLOAT *output) {\n"
" int image_width_idx=get_global_id(0); // ic\n"
" int image_height_idx=get_global_id(1); // oc/4 h w\n"
" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
" const int input_channel_4_idx=image_width_idx;\n"
" const int output_channel_4_idx=(image_height_idx/height_width_size)*4;\n"
" const int height_width_idx=image_height_idx % height_width_size;\n"
" const int buffer_height_idx=height_width_idx/kernel_shape.y;\n"
" const int buffer_width_idx=height_width_idx % kernel_shape.y;\n"
" const int buffer_offset=output_channel_4_idx*ic_h_w_size+input_channel_4_idx*height_width_size +\n"
" buffer_height_idx*kernel_shape.y+buffer_width_idx;\n"
" FLOAT4 output_values=0;\n"
" if (output_channel_4_idx<output_channel) {\n"
" const int remain_channel=output_channel-output_channel_4_idx;\n"
" if (remain_channel >= 4) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" offset=mad24(1,ic_h_w_size,offset);\n"
" output_values.y=(FLOAT)(*(input_ptr+offset));\n"
" offset += ic_h_w_size;\n"
" output_values.z=(FLOAT)(*(input_ptr+offset));\n"
" offset += ic_h_w_size;\n"
" output_values.w=(FLOAT)(*(input_ptr+offset));\n"
" } else if (remain_channel == 3) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" offset=mad24(1,ic_h_w_size,offset);\n"
" output_values.y=(FLOAT)(*(input_ptr+offset));\n"
" offset += ic_h_w_size;\n"
" output_values.z=(FLOAT)(*(input_ptr+offset));\n"
" } else if (remain_channel == 2) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" offset=mad24(1,ic_h_w_size,offset);\n"
" output_values.y=(FLOAT)(*(input_ptr+offset));\n"
" } else if (remain_channel == 1) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" }\n"
" }\n"
" const int out_offset=(image_width_idx*height_width_size*((output_channel+3)/4)+image_height_idx)*4;\n"
" vstore4(output_values,0,output+out_offset);\n"
"}\n"
"// convert kernel : from buffer(oihw) to image(oc/4 h w ,ic oc4)\n"
"__kernel void conv2d_filter_buffer_to_nc4hw4_buffer_floatin(GLOBAL_SIZE_2_DIMS\n"
" __global const float *input_ptr,\n"
" __private const int output_channel,\n"
" __private const int2 kernel_shape,\n"
" __private const int ic_h_w_size,\n"
" __private const int height_width_size,\n"
" __global FLOAT *output) {\n"
" int image_width_idx=get_global_id(0); // ic\n"
" int image_height_idx=get_global_id(1); // oc/4 h w\n"
" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
" const int input_channel_4_idx=image_width_idx;\n"
" const int output_channel_4_idx=(image_height_idx/height_width_size)*4;\n"
" const int height_width_idx=image_height_idx % height_width_size;\n"
" const int buffer_height_idx=height_width_idx/kernel_shape.y;\n"
" const int buffer_width_idx=height_width_idx % kernel_shape.y;\n"
" const int buffer_offset=output_channel_4_idx*ic_h_w_size+input_channel_4_idx*height_width_size +\n"
" buffer_height_idx*kernel_shape.y+buffer_width_idx;\n"
" FLOAT4 output_values=0;\n"
" if (output_channel_4_idx<output_channel) {\n"
" const int remain_channel=output_channel-output_channel_4_idx;\n"
" if (remain_channel >= 4) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" offset=mad24(1,ic_h_w_size,offset);\n"
" output_values.y=(FLOAT)(*(input_ptr+offset));\n"
" offset += ic_h_w_size;\n"
" output_values.z=(FLOAT)(*(input_ptr+offset));\n"
" offset += ic_h_w_size;\n"
" output_values.w=(FLOAT)(*(input_ptr+offset));\n"
" } else if (remain_channel == 3) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" offset=mad24(1,ic_h_w_size,offset);\n"
" output_values.y=(FLOAT)(*(input_ptr+offset));\n"
" offset += ic_h_w_size;\n"
" output_values.z=(FLOAT)(*(input_ptr+offset));\n"
" } else if (remain_channel == 2) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" offset=mad24(1,ic_h_w_size,offset);\n"
" output_values.y=(FLOAT)(*(input_ptr+offset));\n"
" } else if (remain_channel == 1) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" }\n"
" }\n"
" const int out_offset=(image_width_idx*height_width_size*((output_channel+3)/4)+image_height_idx)*4;\n"
" vstore4(output_values,0,output+out_offset);\n"
"}\n"
"// convert kernel from buffer(mihw) to image(ic/4,ic4 h w m)\n"
"// but now dw only support m == 1\n"
"__kernel void dw_filter_buffer_to_nc4hw4_buffer(GLOBAL_SIZE_2_DIMS\n"
" __global const FLOAT *input_ptr,\n"
" __private const int4 kernel_shape,//[1,Cout,fh,fw]\n"
" __private const int height_width_size,\n"
" __global FLOAT *output) {\n"
" const int image_width_idx=get_global_id(0);//fh*fw\n"
" const int image_height_idx=get_global_id(1);//UP_DIV(Cout,4)\n"
" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
" FLOAT4 output_values=0;\n"
" if (kernel_shape.x == 1) {\n"
" const int input_channel_4_idx=image_height_idx*4;\n"
" const int buffer_height_idx=image_width_idx/kernel_shape.w;\n"
" const int buffer_width_idx=image_width_idx % kernel_shape.w;\n"
" const int buffer_offset =\n"
" mad24(mad24(input_channel_4_idx,kernel_shape.z,buffer_height_idx),kernel_shape.w,buffer_width_idx);\n"
" //input [1,Cout,fh,fw]\n"
" //index:[0,input_channel_4_idx,buffer_height_idx,buffer_width_idx]\n"
" const int remain_channel=kernel_shape.y-input_channel_4_idx;\n"
" if (input_channel_4_idx<kernel_shape.y) {\n"
" if (remain_channel >= 4) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" offset += height_width_size;\n"
" output_values.y=(FLOAT)(*(input_ptr+offset));\n"
" offset += height_width_size;\n"
" output_values.z=(FLOAT)(*(input_ptr+offset));\n"
" offset += height_width_size;\n"
" output_values.w=(FLOAT)(*(input_ptr+offset));\n"
" } else if (remain_channel == 3) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" offset += height_width_size;\n"
" output_values.y=(FLOAT)(*(input_ptr+offset));\n"
" offset += height_width_size;\n"
" output_values.z=(FLOAT)(*(input_ptr+offset));\n"
" } else if (remain_channel == 2) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" offset += height_width_size;\n"
" output_values.y=(FLOAT)(*(input_ptr+offset));\n"
" } else if (remain_channel == 1) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" }\n"
" }\n"
" }\n"
" //output NC4HW4 [1,fw*fh,1,Cout/4]x oc4\n"
" //index: [0,image_width_idx,0,image_height_idx]\n"
" const int out_offset=(image_width_idx*((kernel_shape.y+3)/4)+image_height_idx)*4;\n"
" vstore4(output_values,0,output+out_offset);\n"
"}\n"
"__kernel void dw_filter_buffer_to_nc4hw4_buffer_floatin(GLOBAL_SIZE_2_DIMS\n"
" __global const float *input_ptr,\n"
" __private const int4 kernel_shape,//[1,Cout,fh,fw]\n"
" __private const int height_width_size,\n"
" __global FLOAT *output) {\n"
" const int image_width_idx=get_global_id(0);//fh*fw\n"
" const int image_height_idx=get_global_id(1);//UP_DIV(Cout,4)\n"
" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
" FLOAT4 output_values=0;\n"
" if (kernel_shape.x == 1) {\n"
" const int input_channel_4_idx=image_height_idx*4;\n"
" const int buffer_height_idx=image_width_idx/kernel_shape.w;\n"
" const int buffer_width_idx=image_width_idx % kernel_shape.w;\n"
" const int buffer_offset =\n"
" mad24(mad24(input_channel_4_idx,kernel_shape.z,buffer_height_idx),kernel_shape.w,buffer_width_idx);\n"
" //input [1,Cout,fh,fw]\n"
" //index:[0,input_channel_4_idx,buffer_height_idx,buffer_width_idx]\n"
" const int remain_channel=kernel_shape.y-input_channel_4_idx;\n"
" if (input_channel_4_idx<kernel_shape.y) {\n"
" if (remain_channel >= 4) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" offset += height_width_size;\n"
" output_values.y=(FLOAT)(*(input_ptr+offset));\n"
" offset += height_width_size;\n"
" output_values.z=(FLOAT)(*(input_ptr+offset));\n"
" offset += height_width_size;\n"
" output_values.w=(FLOAT)(*(input_ptr+offset));\n"
" } else if (remain_channel == 3) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" offset += height_width_size;\n"
" output_values.y=(FLOAT)(*(input_ptr+offset));\n"
" offset += height_width_size;\n"
" output_values.z=(FLOAT)(*(input_ptr+offset));\n"
" } else if (remain_channel == 2) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" offset += height_width_size;\n"
" output_values.y=(FLOAT)(*(input_ptr+offset));\n"
" } else if (remain_channel == 1) {\n"
" int offset=buffer_offset;\n"
" output_values.x=(FLOAT)(*(input_ptr+offset));\n"
" }\n"
" }\n"
" }\n"
" //output NC4HW4 [1,fw*fh,1,Cout/4]x oc4\n"
" //index: [0,image_width_idx,0,image_height_idx]\n"
" const int out_offset=(image_width_idx*((kernel_shape.y+3)/4)+image_height_idx)*4;\n"
" vstore4(output_values,0,output+out_offset);\n"
"}\n"
;
#endif
}
